Pokemon Data

#link: https://www.kaggle.com/abcsds/pokemon
pokemon <- read.csv("Pokemon.csv")

library(ggplot2)
library(car) #gives LeveneTest
library(EnvStats) #for Box-Cox transformations
library(MASS) # for model selecting
library(plotly)
library(knitr)
library(forcats)
library(rcompanion)
library(moments)
library(magick)
library(grid)

#we must convert Generation into a factor
pokemon$Generation <- as.factor(pokemon$Generation)
str(pokemon)
## 'data.frame':    800 obs. of  13 variables:
##  $ X.        : int  1 2 3 3 4 5 6 6 6 7 ...
##  $ Name      : Factor w/ 800 levels "Abomasnow","AbomasnowMega Abomasnow",..: 81 330 746 747 103 104 100 101 102 666 ...
##  $ Type.1    : Factor w/ 18 levels "Bug","Dark","Dragon",..: 10 10 10 10 7 7 7 7 7 18 ...
##  $ Type.2    : Factor w/ 19 levels "","Bug","Dark",..: 15 15 15 15 1 1 9 4 9 1 ...
##  $ Total     : int  318 405 525 625 309 405 534 634 634 314 ...
##  $ HP        : int  45 60 80 80 39 58 78 78 78 44 ...
##  $ Attack    : int  49 62 82 100 52 64 84 130 104 48 ...
##  $ Defense   : int  49 63 83 123 43 58 78 111 78 65 ...
##  $ Sp..Atk   : int  65 80 100 122 60 80 109 130 159 50 ...
##  $ Sp..Def   : int  65 80 100 120 50 65 85 85 115 64 ...
##  $ Speed     : int  45 60 80 80 65 80 100 100 100 43 ...
##  $ Generation: Factor w/ 6 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Legendary : Factor w/ 2 levels "False","True": 1 1 1 1 1 1 1 1 1 1 ...
#notice that mega pokemon are counted as their own (e.g., AbomasnowMega)

kable(head(pokemon,15))
X. Name Type.1 Type.2 Total HP Attack Defense Sp..Atk Sp..Def Speed Generation Legendary
1 Bulbasaur Grass Poison 318 45 49 49 65 65 45 1 False
2 Ivysaur Grass Poison 405 60 62 63 80 80 60 1 False
3 Venusaur Grass Poison 525 80 82 83 100 100 80 1 False
3 VenusaurMega Venusaur Grass Poison 625 80 100 123 122 120 80 1 False
4 Charmander Fire 309 39 52 43 60 50 65 1 False
5 Charmeleon Fire 405 58 64 58 80 65 80 1 False
6 Charizard Fire Flying 534 78 84 78 109 85 100 1 False
6 CharizardMega Charizard X Fire Dragon 634 78 130 111 130 85 100 1 False
6 CharizardMega Charizard Y Fire Flying 634 78 104 78 159 115 100 1 False
7 Squirtle Water 314 44 48 65 50 64 43 1 False
8 Wartortle Water 405 59 63 80 65 80 58 1 False
9 Blastoise Water 530 79 83 100 85 105 78 1 False
9 BlastoiseMega Blastoise Water 630 79 103 120 135 115 78 1 False
10 Caterpie Bug 195 45 30 35 20 20 45 1 False
11 Metapod Bug 205 50 20 55 25 25 30 1 False

Tables of Types

#we make tables to get an idea of the amount of pokemon by each type
type.table <- table(pokemon$Type.1, pokemon$Type.2)
kable(type.table)
Bug Dark Dragon Electric Fairy Fighting Fire Flying Ghost Grass Ground Ice Normal Poison Psychic Rock Steel Water
Bug 17 0 0 0 2 0 2 2 14 1 6 2 0 0 12 0 3 7 1
Dark 10 0 0 3 0 0 2 3 5 2 0 0 2 0 0 2 0 2 0
Dragon 11 0 0 0 1 1 0 1 6 0 0 5 3 0 0 4 0 0 0
Electric 27 0 0 1 0 1 0 1 5 1 1 0 1 2 0 0 0 3 1
Fairy 15 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0
Fighting 20 0 1 0 0 0 0 0 1 0 0 0 0 0 0 3 0 2 0
Fire 28 0 0 1 0 0 7 0 6 0 0 3 0 2 0 2 1 1 1
Flying 2 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Ghost 10 0 1 2 0 0 0 3 2 0 10 0 0 0 4 0 0 0 0
Grass 33 0 3 1 0 2 3 0 5 0 0 1 3 0 15 2 0 2 0
Ground 13 0 3 2 1 0 0 1 4 2 0 0 0 0 0 2 3 1 0
Ice 13 0 0 0 0 0 0 0 2 1 0 3 0 0 0 2 0 0 3
Normal 61 0 0 0 0 5 2 0 24 0 2 1 0 0 0 2 0 0 1
Poison 15 1 3 1 0 0 2 0 3 0 0 2 0 0 0 0 0 0 1
Psychic 38 0 1 0 0 6 3 1 6 1 1 0 0 0 0 0 0 0 0
Rock 9 2 2 2 0 3 1 0 4 0 2 6 2 0 0 2 0 3 6
Steel 5 0 0 1 0 3 1 0 1 4 0 2 0 0 0 7 3 0 0
Water 59 0 6 2 2 2 3 0 7 2 3 10 3 0 3 5 4 1 0
t1.table <- table(pokemon$Type.1)
kable(t1.table)
Var1 Freq
Bug 69
Dark 31
Dragon 32
Electric 44
Fairy 17
Fighting 27
Fire 52
Flying 4
Ghost 32
Grass 70
Ground 32
Ice 24
Normal 98
Poison 28
Psychic 57
Rock 44
Steel 27
Water 112
t2.table <- table(pokemon$Type.2)
kable(t2.table)
Var1 Freq
386
Bug 3
Dark 20
Dragon 18
Electric 6
Fairy 23
Fighting 26
Fire 12
Flying 97
Ghost 14
Grass 25
Ground 35
Ice 14
Normal 4
Poison 34
Psychic 33
Rock 14
Steel 22
Water 14
Above, we h ave a table showing the number of pokemon with an interaction between Type 1 and Type 2, type 1, and type 2.

Plots and Summary Stats

#see if generations have different numbers of Type 1's and base stat totals
ggplot(pokemon, aes(Type.1, Total, color = Generation)) + geom_boxplot() + theme(plot.subtitle = element_text(vjust = 1), 
    plot.caption = element_text(vjust = 1), 
    panel.grid.major = element_line(linetype = "blank"), 
    panel.background = element_rect(fill = "thistle1"), 
    plot.background = element_rect(fill = "cyan2")) +labs(title = "Stat Total by Type 1 and Generation", 
    x = "Type 1")

#now type 2:
ggplot(pokemon, aes(Type.2, Total, color = Generation)) + geom_boxplot() + theme(plot.subtitle = element_text(vjust = 1), 
    plot.caption = element_text(vjust = 1), 
    panel.grid.major = element_line(linetype = "blank"), 
    panel.background = element_rect(fill = "rosybrown1"), 
    plot.background = element_rect(fill = "aquamarine1")) +labs(title = "Stat Total by Type 2 and Generation", 
    x = "Type 2")

#view avg BST by generation in a plot
bst.by.gen <- aggregate(Total ~ Generation, data = pokemon, FUN = mean)
bst.by.gen.plot <- ggplot(bst.by.gen, aes(x=Generation, y=Total, group = 1)) + geom_point(col = "white") + geom_line(color="red")+ theme(plot.subtitle = element_text(vjust = 1), 
    plot.caption = element_text(vjust = 1), 
    panel.grid.major = element_line(linetype = "blank"), 
    panel.grid.minor = element_line(linetype = "blank"), 
    panel.background = element_rect(fill = "black"), 
    plot.background = element_rect(fill = "red")) + ylab("Average BST") + ggtitle("Average BST by Generation")

bst.by.gen.plot

#interactive version:
ggplotly(bst.by.gen.plot)
#
ggplot(pokemon, aes(Total, color=Generation, fill=Generation)) + geom_histogram() + theme(plot.subtitle = element_text(vjust = 1), 
    plot.caption = element_text(vjust = 1), 
    panel.background = element_rect(fill = "honeydew1"), 
    plot.background = element_rect(fill = "lightsteelblue1"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

### Plot frequency of each type
#fct_infreq orders from least to greatest for type I
ggplot(pokemon, aes(x=fct_infreq(Type.1))) + geom_bar(fill = "aquamarine", color = "red") + theme(plot.subtitle = element_text(vjust = 1), 
    plot.caption = element_text(vjust = 1), 
    panel.grid.minor = element_line(linetype = "blank"), 
    panel.background = element_rect(fill = "thistle1"), 
    plot.background = element_rect(fill = "orange1"))+labs(title = "Frequency of each Type I", 
    x = "Type I", y = "Count")

ggplot(pokemon, aes(x=fct_infreq(Type.2))) + geom_bar(fill = "aquamarine", color = "red") + theme(plot.subtitle = element_text(vjust = 1), 
    plot.caption = element_text(vjust = 1), 
    panel.grid.minor = element_line(linetype = "blank"), 
    panel.background = element_rect(fill = "ivory"), 
    plot.background = element_rect(fill = "tomato2"))+labs(title = "Frequency of each Type II", 
    x = "Type II", y = "Count")+labs(caption = "*Note that the empty label means no secondary type")

### Histograms for each stat by Generation

#make histograms

#hp
hist.hp <- ggplot(pokemon, aes(x=HP)) + geom_histogram(col = "black", fill = "red") + facet_grid(.~Generation) + ggtitle("Distribution of HP stat by Generation")

#atk
hist.atk <- ggplot(pokemon, aes(x=Attack)) + geom_histogram(col = "red", fill = "blue") + facet_grid(.~Generation) + ggtitle("Distribution of Attack stat by Generation")

#def
hist.def <- ggplot(pokemon, aes(x=Defense)) + geom_histogram(col = "blue", fill = "aquamarine") + facet_grid(.~Generation) + ggtitle("Distribution of Defense stat by Generation")

#sp atk
hist.spatk <- ggplot(pokemon, aes(x=Sp..Atk)) + geom_histogram(col = "grey", fill = "yellow") + facet_grid(.~Generation) + ggtitle("Distribution of Special Attack stat by Generation")

#sp def
hist.spdef <- ggplot(pokemon, aes(x=Sp..Def)) + geom_histogram(col = "blue", fill = "royalblue2") + facet_grid(.~Generation) + ggtitle("Distribution of Special Defense stat by Generation")

#spd
hist.spd <- ggplot(pokemon, aes(x=Speed)) + geom_histogram(col = "yellow", fill = "blue") + facet_grid(.~Generation) + ggtitle("Distribution of Speed stat by Generation")

#######
ggplotly(hist.hp)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplotly(hist.atk)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplotly(hist.def)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplotly(hist.spatk)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplotly(hist.spdef)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplotly(hist.spd)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#notice how they are all approximately normal and may contain some outliets

Base Stat Total (BST) Distribution

hist.bst <- ggplot(pokemon, aes(x=Total)) + geom_histogram(col = "green", fill = "purple") + facet_grid(.~Generation) + ggtitle("Distribution of Base Stat Total (BST) by Generation") + theme(plot.subtitle = element_text(vjust = 1), 
    plot.caption = element_text(vjust = 1), 
    panel.background = element_rect(fill = "honeydew1"), 
    plot.background = element_rect(fill = "antiquewhite"))

ggplotly(hist.bst)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The histogram suggests BST is approximately normally distributed across all generations.

One-Way ANOVA on BST using Generation

pokemon.lm.model <- lm(Total ~ Generation, data = pokemon)
pokemon.aov <- aov(pokemon.lm.model)
summary(pokemon.aov)
##              Df   Sum Sq Mean Sq F value Pr(>F)
## Generation    5   110928   22186   1.547  0.173
## Residuals   794 11387586   14342
#so no significant difference between at least one generation for BST
plot(pokemon.aov)

#our residuals look random, errors look normal, so this is good



#########################################
#One-Way ANOVA on Legendary and BST
legendary.model <- lm(Total ~ Legendary, data = pokemon)
legendary.aov <- aov(legendary.model)
summary(legendary.aov)
##              Df  Sum Sq Mean Sq F value Pr(>F)    
## Legendary     1 2894883 2894883   268.5 <2e-16 ***
## Residuals   798 8603631   10781                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#This suggest legendaries have a higher BST than non-legendaries

Visualize errors under the model

ggplot(pokemon, aes(pokemon.lm.model$residuals)) + geom_histogram(color = "seagreen2") + xlab("Residuals")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

This plot suggests our errors are skewed left, which violates the normality assumption of the residuals.

Diagnostics/Formal Assumption Testing

resid <- pokemon.lm.model$residuals
shapiro.wilk.test <- shapiro.test(resid)
shapiro.wilk.test
## 
##  Shapiro-Wilk normality test
## 
## data:  resid
## W = 0.9833, p-value = 6.529e-08
#so normality assumption is violated

brown.fors.test <- leveneTest(Total ~ Generation, data = pokemon, center = median)
brown.fors.test
#so equal variance by group condition is met

These results mean that although our equal variance condition is met for any reasonable value of \(\alpha\), our errors are not approximately normally distributed. Therefore, we will consider transforming our variables to satisfy ANOVA assumptions.

Transformations

#check skew of response
skewness(pokemon$Total)
## [1] 0.1522438
#try square rooting the Response Variable
transform.pok <- lm(sqrt(Total) ~ Generation, data = pokemon)
plot(transform.pok)

resid2 <- transform.pok$residuals
shapiro.wilk.test2 <- shapiro.test(resid2)
shapiro.wilk.test2
## 
##  Shapiro-Wilk normality test
## 
## data:  resid2
## W = 0.98315, p-value = 5.748e-08
#so assumptions not met

#now try log(10)
transform.pok2 <- lm(log10(Total) ~ Generation, data = pokemon)
plot(transform.pok2)

resid3 <- transform.pok2$residuals
shapiro.wilk.test3 <- shapiro.test(resid3)
shapiro.wilk.test3
## 
##  Shapiro-Wilk normality test
## 
## data:  resid3
## W = 0.97164, p-value = 2.379e-11
#assumptions not met

Since transformations did not satisfy ANOVA assumptions, we will use non-parametric methods for our hypothesis test.

Non-Parametric Testing

#add a rank column
pokemon$Rank = rank(pokemon$Total, ties = "average")

#get F-stat
F.OBS = summary(lm(Total ~ Generation, data = pokemon))$fstatistic["value"]

permuted.data = pokemon #So we don't overwrite the original data
permuted.data$Generation = sample(permuted.data$Generation, nrow(permuted.data), replace = FALSE) #Permuting the groups
Fi = summary(lm(Total ~ Generation, data = permuted.data))$fstatistic["value"]

#we will permute the data 3000 times
R = 3000
many.perms = sapply(1:R,function(i){
  permuted.data = pokemon #So we don't overwrite the original data
  permuted.data$Generation = sample(permuted.data$Generation, nrow(permuted.data), replace = FALSE) #Permuting the groups
  Fi = summary(lm(Total ~ Generation, data = permuted.data))$fstatistic["value"]
  return(Fi)
})

hist(many.perms, main = "Distribution for Permuted F values", xlab = "Fi")
points(y = 0, x = F.OBS, pch = 17)

#we have a distribution that is skewed right

#get p-val:
mean(many.perms >= F.OBS)
## [1] 0.1783333
#try w/ Kruskall-Wallis test to compare:

nt <- nrow(pokemon)
Ri <- aggregate(Rank ~ Generation, data = pokemon, mean)$Rank
SR.2 = var(pokemon$Rank)
ni = aggregate(Total ~ Generation, data = pokemon, length)$Total

KW.OBS = 1/SR.2*sum(ni*(Ri - (nt+1)/2)^2) #Note, this assumes you calculate ni and Ri above
R = 3000
many.perms.KW = sapply(1:R,function(i){
  permuted.data = pokemon #So we don't overwrite the original data
  permuted.data$Generation = sample(permuted.data$Generation, nrow(permuted.data), replace = FALSE) #Permuting the groups
  SR.2 = var(permuted.data$Rank)
  ni = aggregate(Rank ~ Generation, data = permuted.data,length)$Rank
  Ri = aggregate(Rank ~ Generation, data = permuted.data,mean)$Rank
  KW.i= 1/SR.2*sum(ni*(Ri - (nt+1)/2)^2) 
  return(KW.i)
})
#p-val for KW:
p.value = mean(many.perms.KW > KW.OBS)
p.value
## [1] 0.09633333

Both of these non-parametric tests lead to the conclusion that we cannot conclude the distribution of total (BST) is different for at least one generation.